Clasificador - hito 2

In [121]:
import pandas as pd
import numpy as np
import pickle
from string import punctuation
 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
plt.rc('axes', titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rcParams.update({'font.size': 16})
plt.rcParams['axes.titlesize'] = 16
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams.update({'lines.markeredgewidth': 1})
plt.rcParams.update({'errorbar.capsize': 2})
import plotly
import plotly.express as px

file_names = {
    "df_es_mapping": "../../Data/mapping/df_es_mapping.pickle",
    "df_us_mapping": "../../Data/mapping/df_us_mapping.pickle",
    
    "df_es_test": "../../Data/test/df_es_test.pickle",
    "df_us_test": "../../Data/test/df_us_test.pickle",
    
    "df_es_train": "../../Data/train/df_es_train.pickle",
    "df_us_train": "../../Data/train/df_us_train.pickle",
    
    "df_es_trial": "../../Data/trial/df_es_trial.pickle",
    "df_us_trial": "../../Data/trial/df_us_trial.pickle",
}

# mas imports

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer # tokenizer especial para tweets
tt = TweetTokenizer()
# nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation

import plotly as ply
import plotly.graph_objects as go
import numpy as np

cargar sets

In [2]:
df_es_train = pickle.load(open(file_names["df_es_train"], "rb"))
df_es_trial = pickle.load(open(file_names["df_es_trial"], "rb"))
df_es_test = pickle.load(open(file_names["df_es_test"], "rb"))

df_us_train = pickle.load(open(file_names["df_us_train"], "rb"))
df_us_trial = pickle.load(open(file_names["df_us_trial"], "rb"))
df_us_test = pickle.load(open(file_names["df_us_test"], "rb"))

pre-procesamiento

In [3]:
df_us_train['tokenized_text'] = df_us_train['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
df_us_train.head()
Out[3]:
id text label tokenized_text
0 729044324441186304 Selfies for summatime @ Drexel University 12 selfies for summatime @ drexel university
1 663834134037442560 Ready to be a bulldog with rasso #hailstate #i... 14 ready to be a bulldog with rasso #hailstate #i...
2 747449193350963200 #scored my new #matcotools #slidehammer weight... 16 #scored my new #matcotools #slidehammer weight...
3 691439672761925637 @user last night was so much fun @ Skyway Thea... 6 @user last night was so much fun @ skyway theatre
4 758118895618109440 love beach days @ Manasquan Beach 12 love beach days @ manasquan beach
In [4]:
df_us_test['tokenized_text'] = df_us_test['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
In [5]:
"""stopwords_en_withpunct = set(stopwords_en).union(set(punctuation))
print(list(stopwords_en_withpunct)[:10])""";
In [6]:
from sklearn.feature_extraction.text import CountVectorizer
In [7]:
vectorizer = CountVectorizer(min_df=5)
X_train_bow = vectorizer.fit_transform(df_us_train["tokenized_text"])
X_test_bow = vectorizer.transform(df_us_test["tokenized_text"])
In [8]:
from sklearn.naive_bayes import MultinomialNB
In [9]:
clf = MultinomialNB()
clf.fit(X_train_bow, df_us_train["label"])
Out[9]:
MultinomialNB()
In [10]:
clf.score(X_train_bow, df_us_train["label"])
Out[10]:
0.39701310639001064
In [11]:
from sklearn.metrics import classification_report
In [12]:
df_us_mapping = pickle.load(open(file_names["df_us_mapping"], "rb")).sort_values("label")
df_us_mapping
Out[12]:
label emoji name
0 0 _red_heart_
1 1 😍 _smiling_face_with_hearteyes_
10 10 📷 _camera_
11 11 🇺🇸 _United_States_
12 12 _sun_
13 13 💜 _purple_heart_
14 14 😉 _winking_face_
15 15 💯 _hundred_points_
16 16 😁 _beaming_face_with_smiling_eyes_
17 17 🎄 _Christmas_tree_
18 18 📸 _camera_with_flash_
19 19 😜 _winking_face_with_tongue_
2 2 😂 _face_with_tears_of_joy_
3 3 💕 _two_hearts_
4 4 🔥 _fire_
5 5 😊 _smiling_face_with_smiling_eyes_
6 6 😎 _smiling_face_with_sunglasses_
7 7 _sparkles_
8 8 💙 _blue_heart_
9 9 😘 _face_blowing_a_kiss_
In [13]:
y_pred = clf.predict(X_test_bow)
print(classification_report(df_us_test["label"], y_pred, target_names=df_us_mapping["emoji"]))
              precision    recall  f1-score   support

           ❤       0.35      0.58      0.44     10798
           😍       0.25      0.25      0.25      4830
           📷       0.16      0.16      0.16      1432
          🇺🇸       0.47      0.50      0.48      1949
           ☀       0.25      0.43      0.32      1265
           💜       0.32      0.05      0.08      1114
           😉       0.12      0.04      0.06      1306
           💯       0.27      0.14      0.19      1244
           😁       0.14      0.03      0.05      1153
           🎄       0.60      0.60      0.60      1545
           📸       0.29      0.10      0.15      2417
           😜       0.04      0.01      0.01      1010
           😂       0.30      0.52      0.38      4534
           💕       0.19      0.05      0.08      2605
           🔥       0.45      0.47      0.46      3716
           😊       0.09      0.06      0.07      1613
           😎       0.16      0.11      0.13      1996
           ✨       0.29      0.18      0.22      2749
           💙       0.22      0.07      0.10      1549
           😘       0.16      0.05      0.08      1175

    accuracy                           0.32     50000
   macro avg       0.26      0.22      0.22     50000
weighted avg       0.29      0.32      0.28     50000

In [14]:
vocab = {k: v for v, k in enumerate(vectorizer.get_feature_names_out())}
In [15]:
vec_test = np.zeros(X_train_bow.shape[1])
k = vocab["santa"]
vec_test[k] = 1
print(vectorizer.inverse_transform([vec_test])[0][0])
clf.predict_proba([vec_test])
santa
Out[15]:
array([[0.21267139, 0.10559105, 0.03170409, 0.02263593, 0.06533366,
        0.01011697, 0.02992065, 0.00932074, 0.02055197, 0.11856118,
        0.02276163, 0.02159743, 0.10988494, 0.03433423, 0.01711626,
        0.03922043, 0.05487754, 0.02576475, 0.02707579, 0.02095938]])

Top palabras por emoji

In [16]:
%%time
vocab_length = X_train_bow.shape[1]
proba_matrix = np.array([clf.predict_proba(np.eye(1,vocab_length,k))[0] for k in range(vocab_length)])
Wall time: 18.5 s
In [17]:
print(vocab_length)
print(proba_matrix.shape)
29983
(29983, 20)
In [18]:
una_linea = proba_matrix[:,3]
una_linea.shape
Out[18]:
(29983,)
In [19]:
def topPalabras(proba_matrix,emoji_id,k=5):
    # retorna las palabras para las cuales el emoji en cuestión tiene mas probabilidad
    prob = proba_matrix[:,emoji_id]  # mmm
    ind = np.argpartition(prob,-k)[-k:]
    val = prob[ind]
    palabras = [vectorizer.inverse_transform([np.eye(1,vocab_length,k)[0]])[0][0] for k in ind]
    return palabras, val
In [22]:
i = 9
map_emojis = [0,1,10,11,12,13,14,15,16,17,18,19,2,3,4,5,6,7,8,9]
print(df_us_mapping["emoji"][map_emojis[i]])
topPalabras(proba_matrix,i)
🎄
Out[22]:
(['tree', 'tis', 'christmas2015', 'merry', 'christmastree'],
 array([0.58739394, 0.62045562, 0.59251379, 0.59215716, 0.78961278]))
In [21]:
for i in range(20):
    print(df_us_mapping["emoji"][map_emojis[i]])
    pal, val = topPalabras(proba_matrix,i)
    print(dict([(pal[j],val[j]) for j in range(len(pal))]))
❤
{'heart': 0.5761150508287713, 'valentines': 0.5811796285055364, 'lovemyfamily': 0.5793072574283205, 'valentine': 0.5949253260518093, 'loveofmylife': 0.6651429760457039}
😍
{'inlove': 0.4288496011644503, 'gorg': 0.4453369489332203, 'obsessed': 0.44883209548006375, 'swoon': 0.45600734120528374, 'swooning': 0.4562752924146042}
📷
{'sony': 0.3882939691961041, 'gdlfashion': 0.4052054276523981, 'bvillain': 0.5117380913776236, 'shredforaliving': 0.42318456197855087, 'kae': 0.4156412082407767}
🇺🇸
{'murica': 0.7800742137338681, 'imwithher': 0.8062308003990193, 'election2016': 0.8313563588022287, 'ivoted': 0.8638840691974496, 'merica': 0.8821097001791353}
☀
{'soakin': 0.447527739339606, 'sun': 0.45989390878761877, 'sunny': 0.4740267973356278, 'sunshine': 0.5826999203405687, 'beachin': 0.4927352286841576}
💜
{'snyder': 0.3559341490500346, 'ripprince': 0.3575182172656461, 'endalz': 0.4398786494003444, 'purple': 0.5506017875276537, 'purplerain': 0.5134869871082338}
😉
{'backtowork': 0.2129172475035178, 'wink': 0.23697450673018283, 'azek': 0.23901373031599815, 'silvercriketgentlemensclub': 0.23901373031599815, 'mividaesunatombola': 0.3304591357770921}
💯
{'t3t': 0.39850650010969246, 'keepin': 0.4169922638434261, 'facts': 0.5848465289708452, 'realtalk': 0.5061620053576262, 'rns': 0.47800047055538203}
😁
{'dentist': 0.239497184608953, 'dentistry': 0.2741579803483271, 'cheesin': 0.28598242907316934, 'braces': 0.2908595553667064, 'djsty': 0.29970001389656153}
🎄
{'tree': 0.5873939423592781, 'tis': 0.6204556185878153, 'christmas2015': 0.5925137901344447, 'merry': 0.5921571580626953, 'christmastree': 0.7896127824106279}
📸
{'cred': 0.3159050632200524, 'headshot': 0.3241125570492244, 'mag': 0.3337009742052551, 'opus': 0.4241583818170979, 'bricks': 0.34874909576037366}
😜
{'burpees': 0.16219480409077416, 'jewelrydesigner': 0.17063599291164733, 'wacky': 0.18374154943712573, 'silly': 0.19345012616245877, 'cray': 0.22352568350963714}
😂
{'funny': 0.709298339443829, 'wtf': 0.7376276034392985, 'lmfao': 0.8387500617307875, 'lmao': 0.8600667188416347, 'hilarious': 0.7794513567711954}
💕
{'endorsement': 0.32348853465380645, 'lovealwaysyje': 0.33034960065978825, 'pink': 0.3544185518163782, 'strides': 0.3722537471026334, 'breast': 0.44012548234851656}
🔥
{'flame': 0.6954472080681913, 'flames': 0.7013605810434306, 'fire': 0.702586754042741, 'mixtape': 0.7155634858236927, 'lit': 0.7075956742171502}
😊
{'worlds2016': 0.27632029133395997, '7171': 0.2814042655159173, 'bagsbycab': 0.28184206439355336, '3037': 0.29394808945050926, '802': 0.2838469184935355}
😎
{'beautique': 0.3566104881826646, 'shades': 0.4286315258210222, 'sunglasses': 0.5538250850510409, 'coolin': 0.3872564798469744, 'eyewear': 0.3732911260893004}
✨
{'getonshimmur': 0.38076409557547336, 'sparkle': 0.55606315222639, 'glitter': 0.3824542997618393, 'magical': 0.4041663529709088, 'pixie': 0.41945855964929035}
💙
{'rupp': 0.39496130878522095, 'foreverroyal': 0.43868663666132685, 'royals': 0.43887929760921546, 'autism': 0.4912285850528072, 'bbn': 0.44463409096380896}
😘
{'kissy': 0.2835098942218612, 'kiss': 0.2998014122533109, 'kisses': 0.3778843629927282, 'smooches': 0.3331135467798145, 'princessmailyana': 0.28949959973025136}
In [29]:
!pip install umap-learn
import umap
Requirement already satisfied: umap-learn in c:\users\felip\anaconda3\lib\site-packages (0.5.2)
Requirement already satisfied: pynndescent>=0.5 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (0.5.4)
Requirement already satisfied: scipy>=1.0 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.6.2)
Requirement already satisfied: numba>=0.49 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (0.54.1)
Requirement already satisfied: scikit-learn>=0.22 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.0.2)
Requirement already satisfied: numpy>=1.17 in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (1.20.3)
Requirement already satisfied: tqdm in c:\users\felip\anaconda3\lib\site-packages (from umap-learn) (4.63.0)
Requirement already satisfied: setuptools in c:\users\felip\anaconda3\lib\site-packages (from numba>=0.49->umap-learn) (61.2.0)
Requirement already satisfied: llvmlite<0.38,>=0.37.0rc1 in c:\users\felip\anaconda3\lib\site-packages (from numba>=0.49->umap-learn) (0.37.0)
Requirement already satisfied: joblib>=0.11 in c:\users\felip\anaconda3\lib\site-packages (from pynndescent>=0.5->umap-learn) (1.1.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\felip\anaconda3\lib\site-packages (from scikit-learn>=0.22->umap-learn) (2.2.0)
Requirement already satisfied: colorama in c:\users\felip\appdata\roaming\python\python38\site-packages (from tqdm->umap-learn) (0.4.3)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd
In [30]:
reducer = umap.UMAP()
to_R2 = reducer.fit_transform(proba_matrix)
to_R2.shape
Out[30]:
(29983, 2)
In [57]:
df_umap = pd.DataFrame(to_R2)
df_umap["token"] = vectorizer.get_feature_names_out()
df_umap["label"] = np.argmax(proba_matrix, axis=1).astype(str)
df_umap["proba"] = np.max(proba_matrix, axis=1)
df_umap = df_umap.merge(df_us_mapping, on="label", how="left")
df_umap
Out[57]:
0 1 token label proba emoji name
0 6.465280 7.928913 00 0 0.160577 _red_heart_
1 7.173022 9.377040 000 0 0.144546 _red_heart_
2 11.245706 6.560040 001 2 0.119453 😂 _face_with_tears_of_joy_
3 9.197347 8.636217 004 10 0.116191 📷 _camera_
4 7.833223 7.862528 005 2 0.114231 😂 _face_with_tears_of_joy_
... ... ... ... ... ... ... ...
29978 10.340827 8.301856 σχ 7 0.116272 _sparkles_
29979 7.687988 12.638694 アメリカ 0 0.187796 _red_heart_
29980 7.981842 14.905180 留学 0 0.288069 _red_heart_
29981 7.168561 10.921856 뉴욕 0 0.131694 _red_heart_
29982 8.710210 8.963988 토론토 19 0.116800 😜 _winking_face_with_tongue_

29983 rows × 7 columns

In [135]:
data = []
for label in df_us_mapping["label"]:
    sub_df = df_umap[df_umap["label"] == label]
    data.append(
        go.Scattergl(
            x = sub_df[0],
            y = sub_df[1],
            mode='markers',
            text=sub_df["token"]+"<br>"+sub_df["emoji"]+"<br>"+sub_df["proba"].apply(lambda x: str(np.round(x, 3))),
            name=sub_df["emoji"].iloc[0],
            marker=dict(
                size=25*sub_df["proba"],
                line_width=0.2,
            )
        )
    )
    
fig = go.Figure(data=data)
fig.update_layout(
    title="Proyección (UMAP) de vectores de probabilidad de tokens",
    autosize=False,
    width=700,
    height=500,
)
fig.show(renderer="notebook")